Research shows that climate change has already had multiple observable impacts on the environment: glaciers have shrunk, and a number of animal and plant species are in danger of extinction as a result. Such impacts can fundamentally transform whole ecosystems and the intricate webs of life they support, and they significantly affect our livelihoods, health, and future.
There is no time to wait. Although we cannot stop climate change overnight, we can still slow its pace. To do so, we first must understand how the climate is changing and why.
In this work, we therefore examine representative scientific evidence of climate change and attempt to model and extrapolate the global mean temperature using several prediction models.
The data is stored in the data directory and contains the following files:
- amo.txt: the mean Sea Surface Temperature (SST) of the North Atlantic (0°–70°N), detrended to remove the influence of global warming. See this link.
- co2.txt: a long yearly time series of CO2 concentration.
- nao.txt: an index calculated from air-pressure measurements at two locations: Ponta Delgada, Azores, and Stykkisholmur/Reykjavik, Iceland. See this link.
- spot_num.txt: the number of sunspots. See this link.
- temp.csv: global average temperature by year. See this link.
- volcano.csv: volcanic activities.

import pandas as pd
import datetime
import matplotlib.pyplot as plt
import math
import numpy as np
import seaborn as sns
import itertools
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LassoCV
from sklearn import ensemble
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy import stats
from scipy.optimize import curve_fit
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from scipy.stats import pearsonr
import plotly.graph_objs as go
import plotly.express as ex
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics
from fbprophet.plot import plot_cross_validation_metric
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import warnings
warnings.filterwarnings('ignore')
import logging, sys
logging.disable(sys.maxsize)
Since the data is monthly, we normalize all timestamps to the beginning (first day) of the month.
df_temp = pd.read_csv("data/global_surface_temperature_anomalies.csv")
df_temp.head()
| | date | Avg_Anomaly_deg_C |
|---|---|---|
| 0 | 1880-01-31 | -0.29 |
| 1 | 1880-02-29 | -0.18 |
| 2 | 1880-03-31 | -0.11 |
| 3 | 1880-04-30 | -0.19 |
| 4 | 1880-05-31 | -0.11 |
df_temp['date'] = pd.to_datetime(df_temp['date'])-pd.offsets.MonthBegin(1)
df_temp.set_index('date', inplace=True)
df_temp = df_temp.rename(columns={"Avg_Anomaly_deg_C":"temp"})
df_temp.head()
| | temp |
|---|---|
| date | |
| 1880-01-01 | -0.29 |
| 1880-02-01 | -0.18 |
| 1880-03-01 | -0.11 |
| 1880-04-01 | -0.19 |
| 1880-05-01 | -0.11 |
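Note that subtracting MonthBegin(1) is safe here only because the raw timestamps fall on month ends; applied to a date that is already a month begin, it rolls back a full month. A minimal illustration of the pandas offset behavior:

# MonthBegin(1) subtraction rolls back to the *previous* month-begin boundary
print(pd.Timestamp('1880-01-31') - pd.offsets.MonthBegin(1))  # -> 1880-01-01
print(pd.Timestamp('1880-01-01') - pd.offsets.MonthBegin(1))  # -> 1879-12-01 (rolls back a month)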
fig = go.Figure(layout=go.Layout(
xaxis=dict(title="Year", color='black'),
yaxis=dict(title="°C", color = 'black')
))
fig.add_trace(go.Scatter(
x=df_temp.index,
y=df_temp.temp,
name="Temperature",
line_width=1.5,
opacity=0.8))
fig.update_layout(title_text="Temperature anomalies (°C)",
title_x=0.5,
title_font_size=22,
paper_bgcolor= 'rgb(245, 246, 250)',
plot_bgcolor = 'rgb(245, 246, 250)')
fig.show()
col_month = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split(" ")
df_amo = pd.read_csv("data/amo.txt", sep="   ", index_col=0, names=col_month)  # columns separated by three spaces
df_amo.head()
| | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1856 | 0.237 | 0.170 | 0.241 | 0.160 | 0.213 | 0.235 | 0.248 | 0.225 | 0.292 | 0.142 | 0.153 | 0.247 |
| 1857 | 0.232 | -0.041 | -0.056 | 0.026 | -0.014 | 0.118 | 0.150 | 0.016 | 0.037 | -0.116 | -0.150 | -0.265 |
| 1858 | -0.203 | -0.296 | -0.053 | 0.229 | 0.107 | -0.022 | -0.143 | -0.053 | 0.088 | 0.166 | 0.293 | 0.140 |
| 1859 | -0.115 | -0.103 | 0.077 | 0.167 | 0.104 | -0.084 | -0.163 | 0.080 | 0.097 | 0.145 | 0.077 | 0.099 |
| 1860 | 0.132 | -0.095 | 0.078 | -0.052 | 0.186 | 0.320 | 0.246 | 0.056 | 0.012 | 0.127 | 0.009 | 0.158 |
df_amo.reset_index(inplace=True)
df_amo.rename(columns={'index':'year'}, inplace=True)
df_amo = df_amo.melt(id_vars='year', var_name='month')
df_amo['date'] = pd.to_datetime(df_amo['year'].astype(str) + '-' + df_amo['month'])
df_amo = df_amo.sort_values(by='date')
df_amo.set_index('date', inplace=True)
df_amo["value"] = pd.to_numeric(df_amo["value"], errors="coerce")
df_amo = df_amo.rename(columns={"value":"amo"})
df_amo = df_amo[["amo"]]
df_amo.head()
| | amo |
|---|---|
| date | |
| 1856-01-01 | 0.237 |
| 1856-02-01 | 0.170 |
| 1856-03-01 | 0.241 |
| 1856-04-01 | 0.160 |
| 1856-05-01 | 0.213 |
fig = go.Figure(layout=go.Layout(
xaxis=dict(title="Year", color='black'),
yaxis=dict(title="°C", color = 'black')
))
fig.add_trace(go.Scatter(
x=df_amo.index,
y=df_amo.amo,
name="Temperature",
line_width=1.5,
opacity=0.8))
fig.update_layout(title_text="Sea Surface Temperature of North Atlantic (°C)",
title_x=0.5,
title_font_size=22,
paper_bgcolor= 'rgb(245, 246, 250)',
plot_bgcolor = 'rgb(245, 246, 250)')
fig.show()
df_co2 = pd.read_csv("data/co2.csv")
df_co2.head()
| | Year | Month | Decimal Date | Carbon Dioxide (ppm) | Seasonally Adjusted CO2 (ppm) | Carbon Dioxide Fit (ppm) | Seasonally Adjusted CO2 Fit (ppm) |
|---|---|---|---|---|---|---|---|
| 0 | 1958 | 1 | 1958.0411 | NaN | NaN | NaN | NaN |
| 1 | 1958 | 2 | 1958.1260 | NaN | NaN | NaN | NaN |
| 2 | 1958 | 3 | 1958.2027 | 315.69 | 314.42 | 316.18 | 314.89 |
| 3 | 1958 | 4 | 1958.2877 | 317.45 | 315.15 | 317.30 | 314.98 |
| 4 | 1958 | 5 | 1958.3699 | 317.50 | 314.73 | 317.83 | 315.06 |
We use the smoothed, seasonally adjusted CO2 fit values:
df_co2["date"] = pd.to_datetime(df_co2["Year"].astype(str) + "-" + df_co2["Month"].astype(str))
df_co2.set_index("date", inplace=True)
df_co2 = df_co2.rename(columns={"Seasonally Adjusted CO2 Fit (ppm)":"co2"})
df_co2.co2 = pd.to_numeric(df_co2.co2)
df_co2 = df_co2[["co2"]]
df_co2.head()
| | co2 |
|---|---|
| date | |
| 1958-01-01 | NaN |
| 1958-02-01 | NaN |
| 1958-03-01 | 314.89 |
| 1958-04-01 | 314.98 |
| 1958-05-01 | 315.06 |
fig = go.Figure(layout=go.Layout(
xaxis=dict(title="Year", color='black'),
yaxis=dict(title="ppm", color = 'black')
))
fig.add_trace(go.Scatter(
x=df_co2.index,
y=df_co2.co2,
name="CO2",
line_width=1.5,
opacity=0.8))
fig.update_layout(title_text="CO2 emission (ppm)",
title_x=0.5,
title_font_size=22,
paper_bgcolor= 'rgb(245, 246, 250)',
plot_bgcolor = 'rgb(245, 246, 250)')
fig.show()
col_month = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split(" ")
df_nao = pd.read_csv("data/nao.txt", sep="  ", index_col=0, names=col_month)  # columns separated by two spaces
df_nao.head()
| | Jan | Feb | Mar | Apr | May | Jun | Jul | Aug | Sep | Oct | Nov | Dec |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1950 | 0.92 | 0.40 | -0.36 | 0.73 | -0.59 | -0.06 | -1.26 | -0.05 | 0.25 | 0.85 | -1.26 | -1.02 |
| 1951 | 0.08 | 0.70 | -1.02 | -0.22 | -0.59 | -1.64 | 1.37 | -0.22 | -1.36 | 1.87 | -0.39 | 1.32 |
| 1952 | 0.93 | -0.83 | -1.49 | 1.01 | -1.12 | -0.40 | -0.09 | -0.28 | -0.54 | -0.73 | -1.13 | -0.43 |
| 1953 | 0.33 | -0.49 | -0.04 | -1.67 | -0.66 | 1.09 | 0.40 | -0.71 | -0.35 | 1.32 | 1.04 | -0.47 |
| 1954 | 0.37 | 0.74 | -0.83 | 1.34 | -0.09 | -0.25 | -0.60 | -1.90 | -0.44 | 0.60 | 0.40 | 0.69 |
df_nao.reset_index(inplace=True)
df_nao.rename(columns={'index': 'year'}, inplace=True)
df_nao = df_nao.melt(id_vars='year', var_name='month')
df_nao['date'] = pd.to_datetime(df_nao['year'].astype(str) + '-' + df_nao['month'])
df_nao.sort_values(by='date', inplace=True)
df_nao.set_index('date', inplace=True)
df_nao["value"] = pd.to_numeric(df_nao["value"], errors="coerce")
df_nao = df_nao.rename(columns={"value":"nao"})
df_nao = df_nao[["nao"]]
df_nao.head()
| | nao |
|---|---|
| date | |
| 1950-01-01 | 0.92 |
| 1950-02-01 | 0.40 |
| 1950-03-01 | -0.36 |
| 1950-04-01 | 0.73 |
| 1950-05-01 | -0.59 |
fig = go.Figure(layout=go.Layout(
xaxis=dict(title="Year", color='black'),
yaxis=dict(color = 'black')
))
fig.add_trace(go.Scatter(
x=df_nao.index,
y=df_nao.nao,
name="NAO",
line_width=1.5,
opacity=0.8))
fig.update_layout(title_text="North Atlantic Oscillation",
title_x=0.5,
title_font_size=22,
paper_bgcolor= 'rgb(245, 246, 250)',
plot_bgcolor = 'rgb(245, 246, 250)')
fig.show()
df_ssn = pd.read_csv("data/SN_m_tot_V2.0.csv",sep=";", index_col=2,names=["YEAR","MON","ir1","ssn","ir2","ir3"])
df_ssn.head()
| | YEAR | MON | ir1 | ssn | ir2 | ir3 |
|---|---|---|---|---|---|---|
| 1749.042 | 1749 | 1 | 96.7 | -1.0 | -1 | 1 |
| 1749.123 | 1749 | 2 | 104.3 | -1.0 | -1 | 1 |
| 1749.204 | 1749 | 3 | 116.7 | -1.0 | -1 | 1 |
| 1749.288 | 1749 | 4 | 92.8 | -1.0 | -1 | 1 |
| 1749.371 | 1749 | 5 | 141.7 | -1.0 | -1 | 1 |
df_ssn["date"] = pd.to_datetime(df_ssn.YEAR.astype("str")+'-'+df_ssn.MON.astype(str))
df_ssn.set_index('date', inplace=True)
df_ssn["ssn"] = pd.to_numeric(df_ssn["ssn"],errors="coerce")
df_ssn.ssn.replace({-1.0:np.nan}, inplace=True)
df_ssn = df_ssn[df_ssn.index>='1818-01-01']
df_ssn = df_ssn[["ssn"]]
df_ssn.head()
| | ssn |
|---|---|
| date | |
| 1818-01-01 | 9.7 |
| 1818-02-01 | 7.8 |
| 1818-03-01 | 8.3 |
| 1818-04-01 | 9.6 |
| 1818-05-01 | 11.9 |
fig = go.Figure(layout=go.Layout(
xaxis=dict(title="Year", color='black'),
yaxis=dict(color = 'black')
))
fig.add_trace(go.Scatter(
x=df_ssn.index,
y=df_ssn.ssn,
name="NAO",
line_width=1.5,
opacity=0.8))
fig.update_layout(title_text="North Atlantic Oscillation",
title_x=0.5,
title_font_size=22,
paper_bgcolor= 'rgb(245, 246, 250)',
plot_bgcolor = 'rgb(245, 246, 250)')
fig.show()
df_vol = pd.read_csv("data/volcano.csv",sep=";")
df_vol.head()
| | Year | Month | Day | Associated Tsunami? | Associated Earthquake? | Name | Location | Country | Latitude | Longitude | ... | TOTAL_DEATHS | TOTAL_DEATHS_DESCRIPTION | TOTAL_MISSING | TOTAL_MISSING_DESCRIPTION | TOTAL_INJURIES | TOTAL_INJURIES_DESCRIPTION | TOTAL_DAMAGE_MILLIONS_DOLLARS | TOTAL_DAMAGE_DESCRIPTION | TOTAL_HOUSES_DESTROYED | TOTAL_HOUSES_DESTROYED_DESCRIPTION |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -4360 | NaN | NaN | NaN | NaN | Macauley Island | Kermadec Is | New Zealand | -30,2 | -178,47 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | -4350 | NaN | NaN | NaN | NaN | Kikai | Ryukyu Is | Japan | 30,78 | 130,28 | ... | NaN | 3.0 | NaN | NaN | NaN | NaN | NaN | 3.0 | NaN | 3.0 |
| 2 | -4050 | NaN | NaN | NaN | NaN | Masaya | Nicaragua | Nicaragua | 11,984 | -86,161 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | -4000 | NaN | NaN | NaN | NaN | Pago | New Britain-SW Pac | Papua New Guinea | -5,58 | 150,52 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | -3580 | NaN | NaN | NaN | NaN | Taal | Luzon-Philippines | Philippines | 14,002 | 120,993 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 36 columns
df_vol = df_vol[~df_vol["Month"].isna()]
df_vol = df_vol[df_vol["Year"]>=1749]
df_vol = df_vol[df_vol["Volcano Explosivity Index (VEI)"] > 0]
df_vol = df_vol.rename(columns={"Volcano Explosivity Index (VEI)":"vei"})
df_vol_group = df_vol.groupby(["Year", "Month"])["vei"].sum().reset_index()
df_vol_group["date"] = pd.to_datetime(df_vol_group.Year.astype(str)+'-'+df_vol_group.Month.astype(int).astype(str))
df_vol_group['date'] = pd.to_datetime(df_vol_group['date'])-pd.offsets.MonthBegin(1)
df_vol_group.set_index("date",inplace=True)
df_vol_group = df_vol_group[["vei"]]
df_vol_group.head()
| | vei |
|---|---|
| date | |
| 1749-07-01 | 3.0 |
| 1753-09-01 | 2.0 |
| 1754-04-01 | 4.0 |
| 1755-09-01 | 4.0 |
| 1760-08-01 | 4.0 |
fig = go.Figure(layout=go.Layout(
xaxis=dict(title="Year", color='black'),
yaxis=dict(color = 'black')
))
fig.add_trace(go.Scatter(
x=df_vol_group.index,
y=df_vol_group.vei,
name="NAO",
line_width=1.5,
opacity=0.8))
fig.update_layout(title_text="Volcano Explosivity Index",
title_x=0.5,
title_font_size=22,
paper_bgcolor= 'rgb(245, 246, 250)',
plot_bgcolor = 'rgb(245, 246, 250)')
fig.show()
df = df_temp.join(df_amo,how="outer")
df = df.join(df_co2,how="outer")
#df = df.join(df_enso,how="outer")
df = df.join(df_nao,how="outer")
df = df.join(df_vol_group,how="outer")
df = df.join(df_ssn,how="outer")
df = df[~df.temp.isnull()]
df = df[~df.co2.isnull()]
df
| | temp | amo | co2 | nao | vei | ssn |
|---|---|---|---|---|---|---|
| date | ||||||
| 1958-03-01 | 0.11 | 0.341 | 314.89 | -1.96 | NaN | 11.0 |
| 1958-04-01 | 0.03 | 0.343 | 314.98 | 0.37 | NaN | 11.2 |
| 1958-05-01 | 0.06 | 0.198 | 315.06 | -0.24 | NaN | 10.5 |
| 1958-06-01 | -0.08 | 0.218 | 315.14 | -1.38 | 2.0 | 10.4 |
| 1958-07-01 | 0.05 | 0.161 | 315.21 | -1.73 | NaN | 11.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 2016-09-01 | 0.87 | 0.447 | 404.85 | 0.61 | NaN | 3.8 |
| 2016-10-01 | 0.89 | 0.370 | 405.09 | 0.41 | NaN | 3.1 |
| 2016-11-01 | 0.90 | 0.380 | 405.34 | -0.16 | NaN | 2.6 |
| 2016-12-01 | 0.83 | 0.325 | 405.58 | 0.48 | NaN | 2.4 |
| 2017-01-01 | 0.98 | 0.215 | 405.83 | 0.48 | NaN | 2.5 |
707 rows × 6 columns
df.isnull().sum()
temp      0
amo       0
co2       0
nao       0
vei     577
ssn       0
dtype: int64
sns.heatmap(df.corr())
plt.show()
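The vei column is empty for most months simply because no eruption was recorded then. We drop it below, but one alternative (an assumption, not used in this analysis) would be to encode eruption-free months as zero explosivity and keep vei as a feature:

# Alternative treatment (assumes NaN means "no recorded eruption", not missing data)
df_alt = df.copy()
df_alt['vei'] = df_alt['vei'].fillna(0)  # eruption-free months get zero explosivity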
# Prepare data and split to train-test set
df_copy = df.copy()
df_copy.drop('vei', axis=1, inplace=True)
df_copy.reset_index(inplace=True)
df_copy = df_copy.rename(columns={"date":"ds","temp":"y"})
df_train, df_test = train_test_split(df_copy, test_size=0.2, shuffle=False)
m = Prophet()
m.fit(df_train)
forecast = m.predict(df_test.drop(columns="y"))
m.plot(forecast)
m.plot_components(forecast)
mean_absolute_error(df_test.y, forecast.yhat)
0.10989883625534834
Cross-validation: Prophet's cross_validation performs rolling-origin evaluation, training on an initial 1460-day window and then issuing a 365-day forecast every 180 days:
df_cv = cross_validation(m, initial='1460 days', period='180 days', horizon = '365 days', parallel="processes")
df_p = performance_metrics(df_cv)
df_p.mean()
horizon     201 days 00:00:00
mse         0.024263
rmse        0.155467
mae         0.123468
mdape       0.537091
coverage    0.73485
dtype: object
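Note that df_p holds one row per forecast horizon, so df_p.mean() averages the errors across horizons. To judge a specific lead time, one can filter instead, e.g. for the longest horizon:

# Performance at (approximately) the full 365-day horizon only
df_p[df_p['horizon'] == df_p['horizon'].max()]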
# Prepare data
df_copy = df.copy()
df_copy.drop('vei', axis=1, inplace=True)
df_copy.reset_index(inplace=True)
df_copy = df_copy.rename(columns={"date":"ds","temp":"y"})
# Drop nan as prophet doesn't accept nan in additional regressors
df_copy.dropna(inplace=True)
# Split to train-test set
df_train, df_test = train_test_split(df_copy, test_size=0.2, shuffle=False)
df_test
| | ds | y | amo | co2 | nao | ssn |
|---|---|---|---|---|---|---|
| 565 | 2005-04-01 | 0.67 | 0.283 | 379.07 | -0.30 | 3.8 |
| 566 | 2005-05-01 | 0.62 | 0.284 | 379.29 | -1.25 | 5.7 |
| 567 | 2005-06-01 | 0.66 | 0.318 | 379.51 | -0.05 | 5.3 |
| 568 | 2005-07-01 | 0.64 | 0.438 | 379.72 | -0.51 | 6.1 |
| 569 | 2005-08-01 | 0.60 | 0.431 | 379.94 | 0.37 | 5.3 |
| ... | ... | ... | ... | ... | ... | ... |
| 702 | 2016-09-01 | 0.87 | 0.447 | 404.85 | 0.61 | 3.8 |
| 703 | 2016-10-01 | 0.89 | 0.370 | 405.09 | 0.41 | 3.1 |
| 704 | 2016-11-01 | 0.90 | 0.380 | 405.34 | -0.16 | 2.6 |
| 705 | 2016-12-01 | 0.83 | 0.325 | 405.58 | 0.48 | 2.4 |
| 706 | 2017-01-01 | 0.98 | 0.215 | 405.83 | 0.48 | 2.5 |
142 rows × 6 columns
# Add volcanic activities as holidays
vol_act = pd.DataFrame({
'holiday': 'vol_act',
'ds': pd.to_datetime(df_vol_group.index),
'lower_window': 0,
'upper_window': 1,
})
m = Prophet(holidays = vol_act)
# Add other data as additional regressors
m.add_regressor("amo")
m.add_regressor("co2")
m.add_regressor("nao")
m.fit(df_train)
forecast = m.predict(df_test.drop(columns="y"))
m.plot(forecast)
m.plot_components(forecast)
mean_absolute_error(df_test.y, forecast.yhat)
0.10364185528638496
df_cv = cross_validation(m, initial='1460 days', period='180 days', horizon = '365 days', parallel='processes')
df_p = performance_metrics(df_cv)
df_p.mean()
horizon     201 days 00:00:00
mse         0.020883
rmse        0.143981
mae         0.111453
mdape       0.441239
coverage    0.738582
dtype: object
fig = plot_cross_validation_metric(df_cv, metric='mae')
fig
param_grid = {
'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5],
'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],
}
# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
rmses = [] # Store the RMSEs for each params here
maes = [] # Store the MAEs for each params here
# Use cross validation to evaluate all parameters
from tqdm import tqdm
for params in tqdm(all_params):
    m = Prophet(**params)  # Fit model with given params
    m.add_regressor("amo")
    m.add_regressor("co2")
    m.add_regressor("nao")
    m.fit(df_train)
    df_cv = cross_validation(m, horizon='365 days', parallel='processes')
    df_p = performance_metrics(df_cv, rolling_window=1)
    rmses.append(df_p['rmse'].values[0])
    maes.append(df_p['mae'].values[0])
100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [24:54<00:00, 93.42s/it]
tuning_results = pd.DataFrame(all_params)
tuning_results['rmse'] = rmses
tuning_results['mae'] = maes
tuning_results
| | changepoint_prior_scale | seasonality_prior_scale | rmse | mae |
|---|---|---|---|---|
| 0 | 0.001 | 0.01 | 0.148446 | 0.116462 |
| 1 | 0.001 | 0.10 | 0.149189 | 0.116798 |
| 2 | 0.001 | 1.00 | 0.150007 | 0.117398 |
| 3 | 0.001 | 10.00 | 0.150527 | 0.117491 |
| 4 | 0.010 | 0.01 | 0.144553 | 0.112034 |
| 5 | 0.010 | 0.10 | 0.145130 | 0.112152 |
| 6 | 0.010 | 1.00 | 0.144842 | 0.111610 |
| 7 | 0.010 | 10.00 | 0.146662 | 0.113155 |
| 8 | 0.100 | 0.01 | 0.146218 | 0.112975 |
| 9 | 0.100 | 0.10 | 0.147283 | 0.113842 |
| 10 | 0.100 | 1.00 | 0.147119 | 0.113504 |
| 11 | 0.100 | 10.00 | 0.147882 | 0.114256 |
| 12 | 0.500 | 0.01 | 0.151770 | 0.118454 |
| 13 | 0.500 | 0.10 | 0.153639 | 0.119808 |
| 14 | 0.500 | 1.00 | 0.155293 | 0.120719 |
| 15 | 0.500 | 10.00 | 0.155594 | 0.120924 |
from fbprophet import Prophet
m = Prophet(changepoint_prior_scale=0.01, seasonality_prior_scale=0.01)
m.add_regressor("amo")
m.add_regressor("co2")
m.add_regressor("nao")
m.fit(df_train)
forecast = m.predict(df_test.drop(columns="y"))
mean_absolute_error(df_test.y, forecast.yhat)
0.10293793144158289
df_cv = cross_validation(m, initial='1460 days', period='180 days', horizon = '365 days', parallel='processes')
df_p = performance_metrics(df_cv)
df_p.mean()
horizon     201 days 00:00:00
mse         0.020292
rmse        0.141751
mae         0.111366
mdape       0.452844
coverage    0.747546
dtype: object
fig = go.Figure( layout=go.Layout(xaxis=dict(title = "Year", color = 'black'),
yaxis=dict(title = "Orders", color = 'black'),
))
fig.add_trace(go.Scatter(
x=df_test.ds,
y=df_test.y,
name='y_true',
line_width = 1.5,
opacity=0.8))
fig.add_trace(go.Scatter(
x=forecast.ds,
y=forecast.yhat,
name='y_pred',
line_width = 1.5,
opacity=0.8))
fig.update_layout(title_text = "Temperature forecast", title_x=0.5, title_font_size = 22,
paper_bgcolor = 'rgb(245, 246, 250)', plot_bgcolor = 'rgb(245, 246, 250)')
fig.show()
Hyperparameters for the Random Forest model can be tuned with Scikit-Learn's RandomizedSearchCV. It runs K-fold cross-validation on parameter sets sampled at random from a predefined grid and returns the best-scoring combination.
from datetime import datetime as dt
df_copy = df.copy()
df_copy.drop('vei', axis=1, inplace=True)
df_train, df_test = train_test_split(df_copy, test_size=0.2, shuffle=False)
X_train = df_train.drop('temp', axis=1)
y_train = df_train['temp']
X_test = df_test.drop('temp', axis=1)
y_test = df_test['temp']
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Hyperparameters
random_grid = {'n_estimators': [int(x) for x in np.linspace(200, 2000, num = 10)],# Number of trees in random forest
'max_features': ['auto', 'sqrt'], # Number of features to consider at every split
'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)]+[None], # Maximum number of levels in tree
'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
'min_samples_leaf': [1, 2, 4], # Minimum number of samples required at each leaf node
'bootstrap': [True, False]} # Method of selecting samples for training each tree
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf , param_distributions = random_grid,
n_iter = 50, cv = 3, verbose=1, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)
rf_random.best_params_
Fitting 3 folds for each of 50 candidates, totalling 150 fits
{'n_estimators': 400,
'min_samples_split': 10,
'min_samples_leaf': 4,
'max_features': 'auto',
'max_depth': 70,
'bootstrap': True}
Random search narrows down a promising region for each parameter. We can now search "around" those values with Scikit-Learn's GridSearchCV, which evaluates every combination in a parameter grid:
# Parameter grid based on the results of random search
def generate_param_grid_for_grid_search(random_params):
    # Wrap each best value from the random search in a single-element list
    grid_params = random_params.copy()
    for k in grid_params:
        grid_params[k] = [grid_params[k]]
    # Expand a parameter to [best - step, best, best + step]
    def helper(grid, key, step):
        grid[key] = [grid[key][0] - step, grid[key][0], grid[key][0] + step]
    helper(grid_params, 'max_depth', 10)
    helper(grid_params, 'min_samples_leaf', 1)
    helper(grid_params, 'min_samples_split', 1)
    helper(grid_params, 'n_estimators', 100)
    return grid_params
param_grid = generate_param_grid_for_grid_search(rf_random.best_params_)
# Create a base model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
cv = 3, n_jobs = -1, verbose = 1)
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
Fitting 3 folds for each of 81 candidates, totalling 243 fits
GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
param_grid={'bootstrap': [True], 'max_depth': [60, 70, 80],
'max_features': ['auto'],
'min_samples_leaf': [3, 4, 5],
'min_samples_split': [9, 10, 11],
'n_estimators': [300, 400, 500]},
verbose=1)
grid_search.best_estimator_
RandomForestRegressor(max_depth=60, min_samples_leaf=5, min_samples_split=9,
n_estimators=400)
def model_evaluate(model, X_test, y_test):
    rf_pred = model.predict(X_test)
    errors = abs(rf_pred - y_test)
    mape = 100 * np.mean(errors / y_test)  # note: MAPE is unstable when y_test is near zero
    accuracy = 100 - mape
    print('Model Performance')
    print('MSE: {:0.6f}'.format(mean_squared_error(rf_pred, y_test)))  # mean_squared_error returns MSE, not RMSE
    print('MAE: {:0.6f}'.format(mean_absolute_error(rf_pred, y_test)))
    print("Pearson correlation: {:0.6f}".format(pearsonr(rf_pred, y_test)[0]))
    print("|1 - r|: {:0.6f}".format(abs(1 - pearsonr(rf_pred, y_test)[0])))
    return accuracy
print('Base model')
base_model = RandomForestRegressor()
base_model.fit(X_train, y_train)
base_accuracy = model_evaluate(base_model, X_test, y_test)
print('')
print('Parameter tuned with random search model')
best_random = rf_random.best_estimator_
random_accuracy = model_evaluate(best_random, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))
print('')
print('Parameter tuned with grid search model')
best_grid = grid_search.best_estimator_
grid_accuracy = model_evaluate(best_grid, X_test, y_test)
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))
Base model
Model Performance
MSE: 0.040106
MAE: 0.146517
Pearson correlation: 0.122923
|1 - r|: 0.877077

Parameter tuned with random search model
Model Performance
MSE: 0.039688
MAE: 0.146063
Pearson correlation: 0.177382
|1 - r|: 0.822618
Improvement of 0.18%.

Parameter tuned with grid search model
Model Performance
MSE: 0.039822
MAE: 0.146440
Pearson correlation: 0.176119
|1 - r|: 0.823881
Improvement of 0.13%.
It is important to evaluate the stationarity of a time series. SARIMAX is suited to series with pronounced seasonality and trend; for a series without such structure, plain ARIMA would be the better choice.
Check whether the series is stationary using the Augmented Dickey-Fuller test:
from statsmodels.tsa.seasonal import seasonal_decompose, STL
# Robust STL decomposition of a ten-year window (1990-1999); period=7 matches
# the seasonal period used in the SARIMAX grid search below
tmp = df.temp[df.index >= '1990-01-01']
tmp = tmp[tmp.index <= '1999-12-01']
stl = STL(tmp, robust=True, period=7)
res = stl.fit()
res.plot()
plt.show()
from statsmodels.tsa.stattools import adfuller
def stationary_test(X):
    result = adfuller(X)
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))
stationary_test(df.temp)
ADF Statistic: -1.343270
p-value: 0.609151
Critical Values:
	1%: -3.440
	5%: -2.866
	10%: -2.569
The p-value is above 0.05, so we cannot reject the null hypothesis of a unit root: the series is not stationary. Let's difference it:
from statsmodels.tsa.statespace.tools import diff
df_temp_diff = diff(df.temp, seasonal_periods=365)
df_temp_diff.plot(figsize=(10,6))
plt.show()
stationary_test(df_temp_diff)
ADF Statistic: -9.210171
p-value: 0.000000
Critical Values:
	1%: -3.440
	5%: -2.866
	10%: -2.569
acf = plot_acf(df_temp_diff)
pacf = plot_pacf(df_temp_diff)
plt.show()
The ACF and PACF show similar patterns, which gives no strong evidence for particular AR and MA orders. Both plots contain two significant spikes, suggesting AR(2) and MA(2) as candidates; and since both the ACF and PACF tail off toward zero, ARIMA(1,1,1) might also be a good fit.
However, we will fit SARIMAX to the original time series and perform an exhaustive grid search over the orders to find the best model.
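Before committing to the grid search, one could also fit the ARIMA(1,1,1) candidate directly and inspect its AIC. A minimal sketch, assuming a recent statsmodels where ARIMA lives in statsmodels.tsa.arima.model:

from statsmodels.tsa.arima.model import ARIMA

# Fit the ARIMA(1,1,1) candidate suggested by the ACF/PACF and report its AIC
candidate = ARIMA(df.temp, order=(1, 1, 1)).fit()
print('ARIMA(1,1,1) AIC:', candidate.aic)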
# Split data into train-test set
df_train, df_test = train_test_split(df.temp, test_size=0.2, shuffle=False)
# Exogenous variable
exo_train, exo_test = train_test_split(df.amo, test_size=0.2, shuffle=False)
def sarimax(ts, exo, all_param):
    results = []
    for param in all_param:
        model = SARIMAX(ts,
                        exog=exo,
                        order=param[0],
                        seasonal_order=param[1])
        res = model.fit()
        results.append((res, res.aic, param))
        print('SARIMAX{}x{} - AIC:{}'.format(param[0], param[1], round(res.aic, 2)))
    return results
p,d,q = range(0,3),[1],range(0,3)
P,D,Q,s = range(0,3),[1],range(0,3),[7]
pdq = list(itertools.product(p, d, q))
seasonal_pdq = list(itertools.product(P, D, Q, s))
all_param = list(itertools.product(pdq,seasonal_pdq))
all_res = sarimax(df_train,exo_train,all_param)
SARIMAX(0, 1, 0)x(0, 1, 0, 7) - AIC:-317.74
SARIMAX(0, 1, 0)x(0, 1, 1, 7) - AIC:-697.76
SARIMAX(0, 1, 0)x(0, 1, 2, 7) - AIC:-698.66
SARIMAX(0, 1, 0)x(1, 1, 0, 7) - AIC:-473.36
SARIMAX(0, 1, 0)x(1, 1, 1, 7) - AIC:-698.15
SARIMAX(0, 1, 0)x(1, 1, 2, 7) - AIC:-697.53
SARIMAX(0, 1, 0)x(2, 1, 0, 7) - AIC:-564.65
SARIMAX(0, 1, 0)x(2, 1, 1, 7) - AIC:-700.75
SARIMAX(0, 1, 0)x(2, 1, 2, 7) - AIC:-699.67
SARIMAX(0, 1, 1)x(0, 1, 0, 7) - AIC:-439.66
SARIMAX(0, 1, 1)x(0, 1, 1, 7) - AIC:-822.76
SARIMAX(0, 1, 1)x(0, 1, 2, 7) - AIC:-824.35
SARIMAX(0, 1, 1)x(1, 1, 0, 7) - AIC:-591.19
SARIMAX(0, 1, 1)x(1, 1, 1, 7) - AIC:-823.63
SARIMAX(0, 1, 1)x(1, 1, 2, 7) - AIC:-822.33
SARIMAX(0, 1, 1)x(2, 1, 0, 7) - AIC:-682.49
SARIMAX(0, 1, 1)x(2, 1, 1, 7) - AIC:-827.18
SARIMAX(0, 1, 1)x(2, 1, 2, 7) - AIC:-827.76
SARIMAX(0, 1, 2)x(0, 1, 0, 7) - AIC:-438.5
SARIMAX(0, 1, 2)x(0, 1, 1, 7) - AIC:-821.5
SARIMAX(0, 1, 2)x(0, 1, 2, 7) - AIC:-823.69
SARIMAX(0, 1, 2)x(1, 1, 0, 7) - AIC:-589.19
SARIMAX(0, 1, 2)x(1, 1, 1, 7) - AIC:-822.82
SARIMAX(0, 1, 2)x(1, 1, 2, 7) - AIC:-821.19
SARIMAX(0, 1, 2)x(2, 1, 0, 7) - AIC:-680.93
SARIMAX(0, 1, 2)x(2, 1, 1, 7) - AIC:-826.8
SARIMAX(0, 1, 2)x(2, 1, 2, 7) - AIC:-827.37
SARIMAX(1, 1, 0)x(0, 1, 0, 7) - AIC:-437.52
SARIMAX(1, 1, 0)x(0, 1, 1, 7) - AIC:-804.76
SARIMAX(1, 1, 0)x(0, 1, 2, 7) - AIC:-803.97
SARIMAX(1, 1, 0)x(1, 1, 0, 7) - AIC:-581.69
SARIMAX(1, 1, 0)x(1, 1, 1, 7) - AIC:-803.75
SARIMAX(1, 1, 0)x(1, 1, 2, 7) - AIC:-804.55
SARIMAX(1, 1, 0)x(2, 1, 0, 7) - AIC:-668.72
SARIMAX(1, 1, 0)x(2, 1, 1, 7) - AIC:-806.26
SARIMAX(1, 1, 0)x(2, 1, 2, 7) - AIC:-804.85
SARIMAX(1, 1, 1)x(0, 1, 0, 7) - AIC:-439.76
SARIMAX(1, 1, 1)x(0, 1, 1, 7) - AIC:-822.86
SARIMAX(1, 1, 1)x(0, 1, 2, 7) - AIC:-825.4
SARIMAX(1, 1, 1)x(1, 1, 0, 7) - AIC:-589.21
SARIMAX(1, 1, 1)x(1, 1, 1, 7) - AIC:-824.44
SARIMAX(1, 1, 1)x(1, 1, 2, 7) - AIC:-824.65
SARIMAX(1, 1, 1)x(2, 1, 0, 7) - AIC:-681.68
SARIMAX(1, 1, 1)x(2, 1, 1, 7) - AIC:-829.01
SARIMAX(1, 1, 1)x(2, 1, 2, 7) - AIC:-829.08
SARIMAX(1, 1, 2)x(0, 1, 0, 7) - AIC:-459.13
SARIMAX(1, 1, 2)x(0, 1, 1, 7) - AIC:-822.93
SARIMAX(1, 1, 2)x(0, 1, 2, 7) - AIC:-822.72
SARIMAX(1, 1, 2)x(1, 1, 0, 7) - AIC:-595.02
SARIMAX(1, 1, 2)x(1, 1, 1, 7) - AIC:-822.4
SARIMAX(1, 1, 2)x(1, 1, 2, 7) - AIC:-822.6
SARIMAX(1, 1, 2)x(2, 1, 0, 7) - AIC:-684.29
SARIMAX(1, 1, 2)x(2, 1, 1, 7) - AIC:-826.1
SARIMAX(1, 1, 2)x(2, 1, 2, 7) - AIC:-826.71
SARIMAX(2, 1, 0)x(0, 1, 0, 7) - AIC:-436.94
SARIMAX(2, 1, 0)x(0, 1, 1, 7) - AIC:-807.75
SARIMAX(2, 1, 0)x(0, 1, 2, 7) - AIC:-807.45
SARIMAX(2, 1, 0)x(1, 1, 0, 7) - AIC:-582.63
SARIMAX(2, 1, 0)x(1, 1, 1, 7) - AIC:-807.13
SARIMAX(2, 1, 0)x(1, 1, 2, 7) - AIC:-806.46
SARIMAX(2, 1, 0)x(2, 1, 0, 7) - AIC:-669.97
SARIMAX(2, 1, 0)x(2, 1, 1, 7) - AIC:-809.71
SARIMAX(2, 1, 0)x(2, 1, 2, 7) - AIC:-808.87
SARIMAX(2, 1, 1)x(0, 1, 0, 7) - AIC:-509.72
SARIMAX(2, 1, 1)x(0, 1, 1, 7) - AIC:-854.19
SARIMAX(2, 1, 1)x(0, 1, 2, 7) - AIC:-854.45
SARIMAX(2, 1, 1)x(1, 1, 0, 7) - AIC:-645.37
SARIMAX(2, 1, 1)x(1, 1, 1, 7) - AIC:-852.22
SARIMAX(2, 1, 1)x(1, 1, 2, 7) - AIC:-854.8
SARIMAX(2, 1, 1)x(2, 1, 0, 7) - AIC:-728.2
SARIMAX(2, 1, 1)x(2, 1, 1, 7) - AIC:-851.26
SARIMAX(2, 1, 1)x(2, 1, 2, 7) - AIC:-853.25
SARIMAX(2, 1, 2)x(0, 1, 0, 7) - AIC:-524.98
SARIMAX(2, 1, 2)x(0, 1, 1, 7) - AIC:-852.17
SARIMAX(2, 1, 2)x(0, 1, 2, 7) - AIC:-852.29
SARIMAX(2, 1, 2)x(1, 1, 0, 7) - AIC:-646.89
SARIMAX(2, 1, 2)x(1, 1, 1, 7) - AIC:-850.62
SARIMAX(2, 1, 2)x(1, 1, 2, 7) - AIC:-843.24
SARIMAX(2, 1, 2)x(2, 1, 0, 7) - AIC:-719.09
SARIMAX(2, 1, 2)x(2, 1, 1, 7) - AIC:-851.27
SARIMAX(2, 1, 2)x(2, 1, 2, 7) - AIC:-842.96
all_res.sort(key=lambda x: x[1])
res = all_res[0][0]
print('Best model\nParams: {} - AIC: {}'.format(all_res[0][2], all_res[0][1]))
y_pred = res.get_prediction(start='2005-04-01', end = '2017-01-01',exog=exo_test).predicted_mean
Best model
Params: ((2, 1, 1), (1, 1, 2, 7)) - AIC: -854.795643829147
mean_absolute_error(df_test, y_pred)
0.1056436010206211
fig = go.Figure( layout=go.Layout(xaxis=dict(title = "Year", color = 'black'),
yaxis=dict(title = "Orders", color = 'black'),
))
fig.add_trace(go.Scatter(
x=df_test.index,
y=df_test,
name='y_true',
line_width = 1.5,
opacity=0.8))
fig.add_trace(go.Scatter(
x=df_test.index,
y=y_pred,
name='y_pred',
line_width = 1.5,
opacity=0.8))
fig.update_layout(title_text = "Temperature forecast", title_x=0.5, title_font_size = 22,
paper_bgcolor = 'rgb(245, 246, 250)', plot_bgcolor = 'rgb(245, 246, 250)')
fig.show()
import xgboost as xgb
reg = xgb.XGBRegressor(n_estimators=1000)
reg.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
early_stopping_rounds=50, # stop if 50 consecutive rounds pass without the error decreasing
verbose=False) # Change verbose to True if you want to see it train
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=1000, n_jobs=12, num_parallel_tree=1, random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
xgb.plot_importance(reg, height=0.9)
plt.show()
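The same ranking can also be read programmatically from the fitted regressor's feature_importances_ attribute (standard on XGBRegressor's scikit-learn interface), rather than off the plot; a small sketch:

# Rank features by importance to confirm the strongest ones before subsetting
importances = pd.Series(reg.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(importances)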
# Keep only amo and co2 as features
X_train = X_train[['amo','co2']]
X_test = X_test[['amo','co2']]
reg = xgb.XGBRegressor(n_estimators=1000)
reg.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
early_stopping_rounds=50, # stop if 50 consecutive rounds pass without the error decreasing
verbose=False)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=1000, n_jobs=12, num_parallel_tree=1, random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
def generate_ypred(X_test):
    # Wrap the model's predictions in a single-column numeric DataFrame
    prediction = reg.predict(X_test)
    pred = pd.DataFrame({'y': prediction.astype(float)})
    return pred
ypred = generate_ypred(X_test)
mean_absolute_error(ypred, y_test)
0.12947804390544623
fig = go.Figure( layout=go.Layout(xaxis=dict(title = "Year", color = 'black'),
yaxis=dict(title = "Orders", color = 'black'),
))
fig.add_trace(go.Scatter(
x=df_test.index,
y=y_test,
name='y_true',
line_width = 1.5,
opacity=0.8))
fig.add_trace(go.Scatter(
x=df_test.index,
y=ypred.y,
name='y_pred',
line_width = 1.5,
opacity=0.8))
fig.update_layout(title_text = "Temperature forecast", title_x=0.5, title_font_size = 22,
paper_bgcolor = 'rgb(245, 246, 250)', plot_bgcolor = 'rgb(245, 246, 250)')
fig.show()